#########################################
###       Combine MTO Datasets        ###
#########################################

library(dplyr)
library(tidyr)
library(readstata13)
library(here)

i_am("1_combine_data.R")

# create dataset: keep only one match ####

# Load every MTO source file from the project's Data/ folder.
# Each object is named after the .dta file it is read from.
a_mto_fweights_vote_pseudo <- read.dta13(
  here("Data", "a_mto_fweights_vote_pseudo.dta")
)
# This data is the most recent one.
b_mto_vote_match5_pseudo <- read.dta13(
  here("Data", "b_mto_vote_match5_pseudo_v2.dta")
)
# baseline household survey
d_mto_base_hhld_pseudo <- read.dta13(
  here("Data", "d_mto_base_hhld_pseudo.dta")
)
# baseline person info
e_mto_base_pers_pseudo <- read.dta13(
  here("Data", "e_mto_base_pers_pseudo.dta")
)
# interim adult raw survey data
f_mto_int_rawsvy_ad_pseudo <- read.dta13(
  here("Data", "f_mto_int_rawsvy_ad_pseudo.dta")
)
# final adult raw survey data
h_mto_fin_rawsvy_ad_pseudo <- read.dta13(
  here("Data", "h_mto_fin_rawsvy_ad_pseudo.dta")
)
# final youth raw survey data
i_mto_fin_rawsvy_yt_pseudo <- read.dta13(
  here("Data", "i_mto_fin_rawsvy_yt_pseudo.dta")
)
# final roster data
j_mto_fin_roster_pseudo <- read.dta13(
  here("Data", "j_mto_fin_roster_pseudo.dta")
)
# interim adult analysis file
k_mto_int_adult_analysis_pseudo <- read.dta13(
  here("Data", "k_mto_int_adult_analysis_pseudo.dta")
)
# interim child youth analysis file
# NOTE(review): the object name says "chyt" but the file name says "adult";
# confirm this is the intended file and not a copy of the line above.
l_mto_int_chyt_analysis_pseudo <- read.dta13(
  here("Data", "l_mto_int_adult_analysis_pseudo.dta")
)
# interim roster
m_mto_int_roster_pseudo <- read.dta13(
  here("Data", "m_mto_int_roster_pseudo.dta")
)
# final analysis file
n_mto_fin_analysis_pseudo <- read.dta13(
  here("Data", "n_mto_fin_analysis_pseudo.dta")
)

# Create a text label for the treatment group code in ra_group.
# Codes 1, 2 and 3 receive the labels below (in that order); rows with any
# other code, or a missing code, keep the initial NA. Despite the column name,
# the labels are stored as plain strings, not as an R factor.
ra_group_labels <- c("experimental", "section 8", "control")
a_mto_fweights_vote_pseudo$ra_group_factor <- NA
for (ra_code in seq_along(ra_group_labels)) {
  in_group <- a_mto_fweights_vote_pseudo$ra_group == ra_code
  a_mto_fweights_vote_pseudo$ra_group_factor[in_group] <-
    ra_group_labels[ra_code]
}


# Merge all datasets into one wide frame, keeping unmatched rows from both
# sides at every step (full joins).
# No `by` is supplied, so each join matches on every column name the two inputs
# have in common; dplyr reports the columns it used in a message.
# NOTE(review): `suffix` only renames same-named columns that are NOT join
# keys, so with these natural joins the suffixes look inert. Confirm the
# intended key columns.
dat <- a_mto_fweights_vote_pseudo %>%
  full_join(b_mto_vote_match5_pseudo, suffix = c("a", "b")) %>%
  full_join(d_mto_base_hhld_pseudo, suffix = c("ab", "d")) %>%
  full_join(e_mto_base_pers_pseudo, suffix = c("abd", "e")) %>%
  full_join(f_mto_int_rawsvy_ad_pseudo, suffix = c("abde", "f")) %>%
  full_join(h_mto_fin_rawsvy_ad_pseudo, suffix = c("abdef", "h")) %>%
  full_join(i_mto_fin_rawsvy_yt_pseudo, suffix = c("atoh", "i")) %>%
  full_join(j_mto_fin_roster_pseudo, suffix = c("atoi", "j")) %>%
  full_join(k_mto_int_adult_analysis_pseudo, suffix = c("atoj", "k")) %>%
  full_join(l_mto_int_chyt_analysis_pseudo, suffix = c("atok", "l")) %>%
  full_join(m_mto_int_roster_pseudo, suffix = c("atol", "m")) %>%
  full_join(n_mto_fin_analysis_pseudo, suffix = c("atom", "n"))

# Finally attach the new_pseudo_id_ra_year file (read into `withdate`).
withdate <- read.dta13(here("Data", "new_pseudo_id_ra_year.dta"))
dat <- dat %>% full_join(withdate)

# remove observations without a treatment condition
# (i.e. didn't live in hh at time of assignment)
dat <- dat %>% filter(!is.na(ra_group))

# create indicator for whether an obs was matched:
# 1 when the row has a non-missing posterior, 0 otherwise
dat <- dat %>%
  mutate(matched = case_when(is.na(posterior) ~ 0,
                             TRUE ~ 1))

# code impossible matches: a row is a bad match (1) when it has a non-missing
# r_pretreatturnout but age_group is not "adult" (presumably a vote recorded
# before age 18); every other row gets 0.
# Note: when age_group is NA and r_pretreatturnout is present, the comparison
# is NA, so badmatch is NA rather than 0 or 1.
dat$badmatch <- ifelse(
  !is.na(dat$r_pretreatturnout) & dat$age_group != "adult",
  1,
  0
)

# set all match/turnout variables (the contiguous column range from posterior
# through r_postregturnout) to missing for bad matches.
# `.[NA]` yields an all-NA vector with the column's own type and length, so
# each column keeps its type. Rows where badmatch is NA are left unchanged,
# because case_when() treats an NA condition as not met.
dat <- dat %>%
  mutate(across(c(posterior:r_postregturnout),
                ~ case_when(badmatch == 1 ~ .[NA],
                            TRUE ~ .)))

# create person-level dataset summarizing number/quality of matches
#   n        = number of rows flagged as matched
#   bad      = number of rows flagged as bad matches
#   good     = n - bad
#   max_post = highest posterior among the person's rows
# Note: `matched` was computed before bad matches were blanked, so n still
# counts them. For a person with no non-missing posterior, max() returns -Inf
# with a warning; that value is left as is in this summary table.
inds <- dat %>%
  group_by(mto_pseudo_id) %>%
  dplyr::summarize(n = sum(matched),
                   bad = sum(badmatch),
                   good = n - bad,
                   max_post = max(posterior, na.rm = TRUE))

# update indicator for whether a person was matched (now excluding bad
# matches, whose posterior was set to missing above)
dat <- dat %>%
  mutate(matched = case_when(is.na(posterior) ~ 0,
                             TRUE ~ 1))

# add measures of match number/quality to full dataset
#   max_post = highest posterior within the person, NA when the person has no
#              non-missing posterior (computed directly, which avoids the
#              "no non-missing arguments to max" warning and the -Inf sentinel)
#   n        = number of matched rows within the person
# The result stays grouped by mto_pseudo_id.
dat <- dat %>%
  group_by(mto_pseudo_id) %>%
  mutate(
    max_post = if (all(is.na(posterior))) {
      NA_real_
    } else {
      max(posterior, na.rm = TRUE)
    },
    n = sum(matched)
  )

# create separate frames for obs with no, 1, multiple matches.
# Rows are de-duplicated on the column range mto_pseudo_id:matched.
# `.keep_all = TRUE` (underscore) is the actual distinct() argument; the
# previous spelling `.keep.all` was silently treated as a new column to compute
# and dropped every column outside the range.

# keep each distinct observation with no matches
none <- dat %>%
  filter(n < 1) %>%
  distinct(across(mto_pseudo_id:matched), .keep_all = TRUE)

# keep each distinct observation with one match
one <- dat %>%
  filter(n == 1 & !is.na(posterior)) %>%
  distinct(across(mto_pseudo_id:matched), .keep_all = TRUE)

# for obs with multiple matches, keep only those with the top posterior and
# grab one to keep. The pick among tied top matches is random, so fix the seed
# to make the exported file reproducible across runs (the value itself is
# arbitrary).
set.seed(12345)
mult <- dat %>%
  filter(n > 1 & posterior == max_post) %>%
  group_by(mto_pseudo_id) %>%
  slice_sample(n = 1)

# recombine 0/1/+ match datasets
dat <- bind_rows(none, one, mult)

# remove variables used only for match selection
# (the column range from matched through n)
dat <- dat %>% dplyr::select(-c(matched:n))

# export dataset (written relative to the current working directory)
write.csv(dat, "combined_dataset.csv")


# create dataset: keep all possible matches ####

# Import all datasets again from the project's Data/ folder, so this section
# starts from the files on disk rather than from objects modified above.
a_mto_fweights_vote_pseudo <- read.dta13(
  here("Data", "a_mto_fweights_vote_pseudo.dta")
)
# This data is the most recent one.
b_mto_vote_match5_pseudo <- read.dta13(
  here("Data", "b_mto_vote_match5_pseudo_v2.dta")
)
# baseline household survey
d_mto_base_hhld_pseudo <- read.dta13(
  here("Data", "d_mto_base_hhld_pseudo.dta")
)
# baseline person info
e_mto_base_pers_pseudo <- read.dta13(
  here("Data", "e_mto_base_pers_pseudo.dta")
)
# interim adult raw survey data
f_mto_int_rawsvy_ad_pseudo <- read.dta13(
  here("Data", "f_mto_int_rawsvy_ad_pseudo.dta")
)
# final adult raw survey data
h_mto_fin_rawsvy_ad_pseudo <- read.dta13(
  here("Data", "h_mto_fin_rawsvy_ad_pseudo.dta")
)
# final youth raw survey data
i_mto_fin_rawsvy_yt_pseudo <- read.dta13(
  here("Data", "i_mto_fin_rawsvy_yt_pseudo.dta")
)
# final roster data
j_mto_fin_roster_pseudo <- read.dta13(
  here("Data", "j_mto_fin_roster_pseudo.dta")
)
# interim adult analysis file
k_mto_int_adult_analysis_pseudo <- read.dta13(
  here("Data", "k_mto_int_adult_analysis_pseudo.dta")
)
# interim child youth analysis file
# NOTE(review): the object name says "chyt" but the file name says "adult";
# confirm this is the intended file and not a copy of the line above.
l_mto_int_chyt_analysis_pseudo <- read.dta13(
  here("Data", "l_mto_int_adult_analysis_pseudo.dta")
)
# interim roster
m_mto_int_roster_pseudo <- read.dta13(
  here("Data", "m_mto_int_roster_pseudo.dta")
)
# final analysis file
n_mto_fin_analysis_pseudo <- read.dta13(
  here("Data", "n_mto_fin_analysis_pseudo.dta")
)

# Create a text label for the treatment group code in ra_group.
# Codes 1, 2 and 3 receive the labels below (in that order); rows with any
# other code, or a missing code, keep the initial NA. Despite the column name,
# the labels are stored as plain strings, not as an R factor.
ra_group_labels <- c("experimental", "section 8", "control")
a_mto_fweights_vote_pseudo$ra_group_factor <- NA
for (ra_code in seq_along(ra_group_labels)) {
  in_group <- a_mto_fweights_vote_pseudo$ra_group == ra_code
  a_mto_fweights_vote_pseudo$ra_group_factor[in_group] <-
    ra_group_labels[ra_code]
}

# Join all datasets into one wide frame, keeping unmatched rows from both
# sides at every step (full joins).
# No `by` is supplied, so each join matches on every column name the two inputs
# have in common; dplyr reports the columns it used in a message.
# NOTE(review): `suffix` only renames same-named columns that are NOT join
# keys, so with these natural joins the suffixes look inert. Confirm the
# intended key columns.
dat <- a_mto_fweights_vote_pseudo %>%
  full_join(b_mto_vote_match5_pseudo, suffix = c("a", "b")) %>%
  full_join(d_mto_base_hhld_pseudo, suffix = c("ab", "d")) %>%
  full_join(e_mto_base_pers_pseudo, suffix = c("abd", "e")) %>%
  full_join(f_mto_int_rawsvy_ad_pseudo, suffix = c("abde", "f")) %>%
  full_join(h_mto_fin_rawsvy_ad_pseudo, suffix = c("abdef", "h")) %>%
  full_join(i_mto_fin_rawsvy_yt_pseudo, suffix = c("atoh", "i")) %>%
  full_join(j_mto_fin_roster_pseudo, suffix = c("atoi", "j")) %>%
  full_join(k_mto_int_adult_analysis_pseudo, suffix = c("atoj", "k")) %>%
  full_join(l_mto_int_chyt_analysis_pseudo, suffix = c("atok", "l")) %>%
  full_join(m_mto_int_roster_pseudo, suffix = c("atol", "m")) %>%
  full_join(n_mto_fin_analysis_pseudo, suffix = c("atom", "n"))

# Finally attach the new_pseudo_id_ra_year file (read into `withdate`).
withdate <- read.dta13(here("Data", "new_pseudo_id_ra_year.dta"))
dat <- dat %>% full_join(withdate)

# remove observations without a treatment condition (missing ra_group);
# bad matches are handled further down, not by this filter
dat <- dat %>% filter(!is.na(ra_group))

# create indicator for whether an obs was matched:
# 1 when the row has a non-missing posterior, 0 otherwise
dat <- dat %>%
  mutate(matched = case_when(is.na(posterior) ~ 0,
                             TRUE ~ 1))

# create indicator for bad matches: 1 when the row has a non-missing
# r_pretreatturnout but age_group is not "adult", 0 otherwise.
# Note: when age_group is NA and r_pretreatturnout is present, the comparison
# is NA, so badmatch is NA rather than 0 or 1.
dat$badmatch <- ifelse(
  !is.na(dat$r_pretreatturnout) & dat$age_group != "adult",
  1,
  0
)

# create summaries of match number/quality, one row per person
#   n = matched rows, bad = bad-match rows, good = n - bad,
#   max_post = highest posterior (-Inf, with a warning, when the person has no
#              non-missing posterior)
# This runs before bad matches are blanked, so max_post can come from a row
# that is flagged as a bad match.
inds <- dat %>%
  group_by(mto_pseudo_id) %>%
  dplyr::summarize(n = sum(matched),
                   bad = sum(badmatch),
                   good = n - bad,
                   max_post = max(posterior, na.rm = TRUE))

# set all match/turnout variables (the contiguous column range from posterior
# through r_postregturnout) to missing for bad matches.
# `.[NA]` yields an all-NA vector with the column's own type and length, so
# each column keeps its type. Rows where badmatch is NA are left unchanged,
# because case_when() treats an NA condition as not met.
dat <- dat %>%
  mutate(across(c(posterior:r_postregturnout),
                ~ case_when(badmatch == 1 ~ .[NA],
                            TRUE ~ .)))

# update indicator for whether matched (bad matches now count as unmatched)
dat <- dat %>%
  mutate(matched = case_when(is.na(posterior) ~ 0,
                             TRUE ~ 1))

# remove match number/quality indicators
# (the column range from matched through badmatch)
dat <- dat %>% dplyr::select(-c(matched:badmatch))

# save out dataset (written relative to the current working directory)
write.csv(dat, "combined_dataset_withdups.csv")
